In [1]:
import eda_utility as u
import predictive_utility as p
import pandas as pd
%matplotlib inline
In [2]:
# Load the three Seattle Airbnb source tables (calendar, listings, reviews)
# that the rest of the notebook works from.
calendar, listings, reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Seattle_data/calendar.csv',
        'Seattle_data/listings.csv',
        'Seattle_data/reviews.csv',
    )
)
In [4]:
# EDA for Seattle
# Busiest times: `u.busiest_time` is a project helper from eda_utility (not shown here).
# The displayed frame has `month` and `count` columns, listed from highest to lowest count.
# NOTE(review): confirm in eda_utility what `count` tallies (e.g. booked vs. available
# calendar rows) before describing a month as "busy".
month_count = u.busiest_time(calendar, 'Seattle')
month_count
Out[4]:
month count
11 12 87061
2 3 83938
9 10 82438
10 11 81780
4 5 79971
8 9 77246
5 6 77244
7 8 76347
3 4 76037
6 7 74222
1 2 73321
0 1 64937
In [6]:
# Time series analysis of the calendar table; the 'Seattle' argument is presumably a
# city label used by the helper — confirm against eda_utility.
# NOTE(review): no output is captured for this cell in this export.
u.time_series_analysis(calendar, 'Seattle')
In [6]:
# Weekday decomposition of the calendar table (project helper from eda_utility).
# Unlike the two calendar helpers used earlier, this call passes no city argument.
u.weekday_decomposition(calendar)
In [3]:
# Sentiment Analysis
# Run the project sentiment helper over the reviews table; `sentiment_df` is the input
# to the sentiment distribution and word-plot cells that follow.
sentiment_df = u.sentiment_analysis(reviews)
In [6]:
# Print the sentiment distribution table: one fraction per sentiment label
# (positive, neutral/not applicable, negative) in the recorded output.
u.sentiment_distribution(sentiment_df)
Sentiment distribution table:
                    index  sentiment
0                positive   0.960368
1  neutral/not applicable   0.020653
2                negative   0.018979
In [5]:
# Word plots built from the sentiment results (no output is captured in this export).
u.sentiment_word_plots(sentiment_df)
In [5]:
# Listing distribution visualization
# Reference coordinates for Seattle — presumably [latitude, longitude] used as the map
# centre; confirm the order the map helpers expect.
seattle_location = [47.6062, -122.3321]

# Build the listing map and show it as the cell output. `seattle_location` is reused by
# the listing-count map cell further down, so keep its name and value stable.
listing_map = u.listing_distribution_map(listings, seattle_location)
listing_map
============================================Visualization on Map==============================================

Out[5]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [4]:
# Second map built from the same listings table and reference location.
# NOTE(review): the names suggest per-neighbourhood listing counts — confirm in eda_utility.
neighbourhood_map = u.listing_count_map(listings, seattle_location)
neighbourhood_map
============================================Visualization on Map==============================================

Out[4]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [14]:
# Predictive modeling for the Airbnb 'price' feature

# Step 1 - clean the raw listings table with the project helper.
df = p.clean_listings(listings)

# Step 2 - fit the initial (baseline) model. The helper prints test-set MAE, MSE and R2;
# the returned X and y are passed on to the optimal-model search cell.
# test_size=0.30 presumably holds out 30% of rows for testing; rand_state fixes the split seed.
X, y = p.initial_model(df, test_size=0.30, rand_state=42)
The initial model performance for testing set
--------------------------------------
MAE is 33.04880812289627
MSE is 3035.170499675571
R2 score is 0.6458371086072899
In [9]:
# Search for the best linear model by varying which training features are kept.
# `cutoffs` is the list of thresholds handed to the project helper; what a cutoff
# measures is defined in predictive_utility (not shown here) — confirm there.
# NOTE(review): this cell's recorded execution count (9) is lower than that of the cell
# that defines X and y (14), so the printed scores may come from an earlier X, y.
# Restart the kernel and run all cells top to bottom to confirm the numbers.
cutoffs = [3500, 3000, 2500, 2000, 1000, 500, 100, 50, 30, 25]
(
    best_r2_test,
    best_r2_train,
    lm_model,
    X_train,
    X_test,
    y_train,
    y_test,
) = p.find_optimal_lm_model(X, y, cutoffs)
The model performance for optimal linear regression model
------------------------------------------------------------
Number of Features: 37
R2 score for test set: 0.6465455453531974
R2 score for training set: 0.6451917400822842
In [11]:
# Feature importance
# Tabulate the fitted model's coefficients per predictor. The displayed frame has
# `predictors`, `coefs` and `abs_coefs` columns and appears ordered by `abs_coefs`,
# largest first; only the top 10 rows are shown.
# NOTE(review): in the recorded output the `host_acceptance_rate` coefficient
# (about -71462) is roughly three orders of magnitude larger than any other —
# presumably a scaling or near-constant-column artifact; verify how that column is
# cleaned before reading this table as feature importance.
coef_df = p.coef_weights(lm_model, X_train)
coef_df.head(10)
Out[11]:
predictors coefs abs_coefs
24 host_acceptance_rate -71462.176099 71462.176099
32 room_type_Entire home/apt 51.613111 51.613111
33 room_type_Private room 24.730468 24.730468
3 bathrooms 21.098293 21.098293
4 bedrooms 14.679835 14.679835
31 host_is_superhost_f -8.498724 8.498724
19 review_scores_location 7.694059 7.694059
2 accommodates 6.784409 6.784409
17 review_scores_checkin -6.598437 6.598437
20 review_scores_value -6.315328 6.315328